library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(datarium)
library(ggplot2)
library(ggpubr)
library(pastecs)
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(dplyr)
library(cluster)    
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(mlbench)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

This is the dataset used in the second chapter of Aurélien Géron’s recent book ‘Hands-On Machine Learning with Scikit-Learn and TensorFlow’. It serves as an excellent introduction to implementing machine learning algorithms because it requires rudimentary data cleaning, has an easily understandable list of variables, and sits at an optimal size between being too toy-like and too cumbersome.

The data contains information from the 1990 California census. So although it may not help you with predicting current housing prices, it does provide an accessible introductory dataset for teaching people about the basics of machine learning.

The columns are as follows; their names are pretty self-explanatory: longitude, latitude, housing_median_age, total_rooms, total_bedrooms, population, households, median_income, median_house_value, ocean_proximity


# Load the data
calihousing <- read_csv("housing.csv")
## Rows: 20640 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ocean_proximity
## dbl (9): longitude, latitude, housing_median_age, total_rooms, total_bedroom...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
housing <- as.data.frame(calihousing)

# Drop the categorical column and the column containing missing values,
# selecting the remaining columns BY NAME.
# BUG FIX: the original used `x[, -which(names(x) == "col")]`, which
# (a) silently drops EVERY column when the name is absent, because
# which() returns integer(0) and x[, -integer(0)] selects nothing, and
# (b) the second drop looked the name up in `housing` while indexing
# `housing1` — it only worked by accident because ocean_proximity is the
# last column. setdiff() keeps column order and is robust to both issues.
housing1 <- housing[, setdiff(names(housing), "ocean_proximity")]
housing2 <- housing1[, setdiff(names(housing1), "total_bedrooms")]

# Dimensions after dropping the two columns: 20,640 rows x 8 numeric columns
dim(housing2)
## [1] 20640     8
# Peek at the first rows to sanity-check the load
head(housing2)
##   longitude latitude housing_median_age total_rooms population households
## 1   -122.23    37.88                 41         880        322        126
## 2   -122.22    37.86                 21        7099       2401       1138
## 3   -122.24    37.85                 52        1467        496        177
## 4   -122.25    37.85                 52        1274        558        219
## 5   -122.25    37.85                 52        1627        565        259
## 6   -122.25    37.85                 52         919        413        193
##   median_income median_house_value
## 1        8.3252             452600
## 2        8.3014             358500
## 3        7.2574             352100
## 4        5.6431             341300
## 5        3.8462             342200
## 6        4.0368             269700
# Pairwise Pearson correlations — the first step before building a model.
# median_income shows the strongest correlation with median_house_value
# (~0.69); total_rooms, population and households are highly collinear
# with each other (>0.85).
cor(housing2)
##                      longitude    latitude housing_median_age total_rooms
## longitude           1.00000000 -0.92466443        -0.10819681  0.04456798
## latitude           -0.92466443  1.00000000         0.01117267 -0.03609960
## housing_median_age -0.10819681  0.01117267         1.00000000 -0.36126220
## total_rooms         0.04456798 -0.03609960        -0.36126220  1.00000000
## population          0.09977322 -0.10878475        -0.29624424  0.85712597
## households          0.05531009 -0.07103543        -0.30291601  0.91848449
## median_income      -0.01517587 -0.07980913        -0.11903399  0.19804965
## median_house_value -0.04596662 -0.14416028         0.10562341  0.13415311
##                      population  households median_income median_house_value
## longitude           0.099773223  0.05531009  -0.015175865        -0.04596662
## latitude           -0.108784747 -0.07103543  -0.079809127        -0.14416028
## housing_median_age -0.296244240 -0.30291601  -0.119033990         0.10562341
## total_rooms         0.857125973  0.91848449   0.198049645         0.13415311
## population          1.000000000  0.90722227   0.004834346        -0.02464968
## households          0.907222266  1.00000000   0.013033052         0.06584265
## median_income       0.004834346  0.01303305   1.000000000         0.68807521
## median_house_value -0.024649679  0.06584265   0.688075208         1.00000000
# Structure check: all 8 remaining columns are numeric
str(housing2)
## 'data.frame':    20640 obs. of  8 variables:
##  $ longitude         : num  -122 -122 -122 -122 -122 ...
##  $ latitude          : num  37.9 37.9 37.9 37.9 37.9 ...
##  $ housing_median_age: num  41 21 52 52 52 52 52 52 42 52 ...
##  $ total_rooms       : num  880 7099 1467 1274 1627 ...
##  $ population        : num  322 2401 496 558 565 ...
##  $ households        : num  126 1138 177 219 259 ...
##  $ median_income     : num  8.33 8.3 7.26 5.64 3.85 ...
##  $ median_house_value: num  452600 358500 352100 341300 342200 ...
# Quartiles, min/max and means per column. Note median_house_value maxes
# out at exactly 500001 — the target appears capped at that value.
summary(housing2)
##    longitude         latitude     housing_median_age  total_rooms   
##  Min.   :-124.3   Min.   :32.54   Min.   : 1.00      Min.   :    2  
##  1st Qu.:-121.8   1st Qu.:33.93   1st Qu.:18.00      1st Qu.: 1448  
##  Median :-118.5   Median :34.26   Median :29.00      Median : 2127  
##  Mean   :-119.6   Mean   :35.63   Mean   :28.64      Mean   : 2636  
##  3rd Qu.:-118.0   3rd Qu.:37.71   3rd Qu.:37.00      3rd Qu.: 3148  
##  Max.   :-114.3   Max.   :41.95   Max.   :52.00      Max.   :39320  
##    population      households     median_income     median_house_value
##  Min.   :    3   Min.   :   1.0   Min.   : 0.4999   Min.   : 14999    
##  1st Qu.:  787   1st Qu.: 280.0   1st Qu.: 2.5634   1st Qu.:119600    
##  Median : 1166   Median : 409.0   Median : 3.5348   Median :179700    
##  Mean   : 1425   Mean   : 499.5   Mean   : 3.8707   Mean   :206856    
##  3rd Qu.: 1725   3rd Qu.: 605.0   3rd Qu.: 4.7432   3rd Qu.:264725    
##  Max.   :35682   Max.   :6082.0   Max.   :15.0001   Max.   :500001
# Extended descriptive statistics (pastecs::stat.desc), rounded for display:
# counts, NA counts, range, sum, SE of the mean, 95% CI, variance, SD and
# coefficient of variation for each column
des <- stat.desc(housing2)
round(des, 3)
##                 longitude   latitude housing_median_age  total_rooms
## nbr.val         20640.000  20640.000          20640.000    20640.000
## nbr.null            0.000      0.000              0.000        0.000
## nbr.na              0.000      0.000              0.000        0.000
## min              -124.350     32.540              1.000        2.000
## max              -114.310     41.950             52.000    39320.000
## range              10.040      9.410             51.000    39318.000
## sum          -2467918.700 735441.620         591119.000 54402150.000
## median           -118.490     34.260             29.000     2127.000
## mean             -119.570     35.632             28.639     2635.763
## SE.mean             0.014      0.015              0.088       15.185
## CI.mean.0.95        0.027      0.029              0.172       29.764
## var                 4.014      4.562            158.396  4759445.106
## std.dev             2.004      2.136             12.586     2181.615
## coef.var           -0.017      0.060              0.439        0.828
##                population  households median_income median_house_value
## nbr.val         20640.000 2.06400e+04     20640.000       2.064000e+04
## nbr.null            0.000 0.00000e+00         0.000       0.000000e+00
## nbr.na              0.000 0.00000e+00         0.000       0.000000e+00
## min                 3.000 1.00000e+00         0.500       1.499900e+04
## max             35682.000 6.08200e+03        15.000       5.000010e+05
## range           35679.000 6.08100e+03        14.500       4.850020e+05
## sum          29421840.000 1.03105e+07     79890.650       4.269504e+09
## median           1166.000 4.09000e+02         3.535       1.797000e+05
## mean             1425.477 4.99540e+02         3.871       2.068558e+05
## SE.mean             7.883 2.66100e+00         0.013       8.032200e+02
## CI.mean.0.95       15.450 5.21600e+00         0.026       1.574374e+03
## var           1282470.457 1.46176e+05         3.609       1.331615e+10
## std.dev          1132.462 3.82330e+02         1.900       1.153956e+05
## coef.var            0.794 7.65000e-01         0.491       5.580000e-01

Now let's build a regression model.

# Visualize the relationship of each candidate feature with house value;
# the fitted line in each panel is a simple linear regression (y ~ x).
gg1 <- ggplot(housing2, aes(median_income, median_house_value)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ x) +
  ggtitle("Simple linear regression model") +
  xlab("Median income") + ylab("Average Price of the houses")

gg2 <- ggplot(housing2, aes(housing_median_age, median_house_value)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ x) +
  ggtitle("Simple linear regression model") +
  xlab("Average age of the houses") + ylab("Average Price of the houses")

gg3 <- ggplot(housing2, aes(total_rooms, median_house_value)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ x) +
  ggtitle("Simple linear regression model") +
  # BUG FIX: the x variable is total_rooms, but the axis was previously
  # labelled "Number of bedrooms"
  xlab("Total number of rooms") + ylab("Average Price of the houses")

# Arrange the three panels in a 2 x 2 grid (the fourth cell stays empty)
figure <- ggarrange(gg1, gg2, gg3,
                    labels = c("A", "B", "C"),
                    ncol = 2, nrow = 2)

figure # the relationships seem quite linear

# Build the multiple linear regression (MLR) model: predict house value
# from age, income and room count
mod <- lm(median_house_value ~ housing_median_age + median_income + total_rooms , data = housing2 )
# Check the distribution of residuals
hist(mod$residuals) # Histogram of residuals

qqnorm(mod$residuals) # Q-Q plot of residuals

# Refit with a log-transformed response to improve residual normality
mod_log <- lm(log(median_house_value) ~ housing_median_age + median_income + total_rooms, data = housing2)

# Check the distribution of residuals again
hist(mod_log$residuals)  # Histogram of residuals

qqnorm(mod_log$residuals)  # Q-Q plot of residuals

# Diagnostic plots for heteroscedasticity
# Residual vs. Fitted Values plot
plot(mod, which = 1)

# Scale-Location plot (Square root of standardized residuals vs. Fitted Values)
plot(mod, which = 3)

# Residuals vs. Leverage plot
plot(mod, which = 5)

# Apply weighted least squares regression to fix heteroscedasticity.
# NOTE(review): weights come from the OLS residuals; a residual of exactly
# zero would produce an infinite weight — confirm none occur in this data.
weights <- 1/sqrt(abs(mod$residuals))
mod_wls <- lm(median_house_value ~ housing_median_age + median_income + total_rooms, data = housing2, weights = weights)
# Breusch-Pagan test: does heteroscedasticity remain after weighting?
lmtest::bptest(mod_wls)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod_wls
## BP = 4.9739, df = 3, p-value = 0.1737
# Fit a Gaussian GLM (an ordinary linear model, despite glm(); NOT a
# log/logistic model) of price on the categorical ocean_proximity predictor
mod2 <- glm(median_house_value ~ ocean_proximity, data = housing)
# Summarize the MLR model (all three predictors significant; R^2 ~ 0.51)
summary(mod)
## 
## Call:
## lm(formula = median_house_value ~ housing_median_age + median_income + 
##     total_rooms, data = housing2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -584934  -53506  -15123   36448  450986 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -2.433e+04  2.160e+03  -11.26   <2e-16 ***
## housing_median_age  1.975e+03  4.780e+01   41.32   <2e-16 ***
## median_income       4.247e+04  3.012e+02  140.98   <2e-16 ***
## total_rooms         3.888e+00  2.793e-01   13.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 80480 on 20636 degrees of freedom
## Multiple R-squared:  0.5137, Adjusted R-squared:  0.5136 
## F-statistic:  7266 on 3 and 20636 DF,  p-value: < 2.2e-16
# Summarize the Gaussian GLM: coefficients are price offsets relative to
# the baseline ocean_proximity level ("<1H OCEAN")
summary(mod2)
## 
## Call:
## glm(formula = median_house_value ~ ocean_proximity, data = housing)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -236712   -66247   -21005    42273   375196  
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 240084       1054 227.804  < 2e-16 ***
## ocean_proximityINLAND      -115279       1631 -70.686  < 2e-16 ***
## ocean_proximityISLAND       140356      45062   3.115  0.00184 ** 
## ocean_proximityNEAR BAY      19128       2354   8.125 4.71e-16 ***
## ocean_proximityNEAR OCEAN     9350       2220   4.212 2.55e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 10147556380)
## 
##     Null deviance: 2.7483e+14  on 20639  degrees of freedom
## Residual deviance: 2.0939e+14  on 20635  degrees of freedom
## AIC: 534137
## 
## Number of Fisher Scoring iterations: 2
# Summarize the weighted least squares refit (weighted R^2 ~ 0.70)
summary(mod_wls)
## 
## Call:
## lm(formula = median_house_value ~ housing_median_age + median_income + 
##     total_rooms, data = housing2, weights = weights)
## 
## Weighted Residuals:
##    Min     1Q Median     3Q    Max 
## -21318  -3251   -995   2906  17634 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -2.983e+04  1.452e+03  -20.54   <2e-16 ***
## housing_median_age  1.930e+03  3.258e+01   59.25   <2e-16 ***
## median_income       4.330e+04  2.084e+02  207.72   <2e-16 ***
## total_rooms         3.733e+00  1.987e-01   18.79   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4299 on 20636 degrees of freedom
## Multiple R-squared:  0.6987, Adjusted R-squared:  0.6987 
## F-statistic: 1.595e+04 on 3 and 20636 DF,  p-value: < 2.2e-16

The symbol *** on the MLR model summary denotes a p-value less than 0.001, indicating a highly significant coefficient. The F-statistic in our situation is quite big, and the corresponding p-value is less than 2e-16 (nearly zero), suggesting that the model as a whole is highly statistically significant. There was also a problem of heteroscedasticity, so we applied weighted least squares regression; based on the studentized Breusch-Pagan test, the p-value is 0.1737. The null hypothesis of the test is homoscedasticity (no heteroscedasticity), and the alternative hypothesis is heteroscedasticity. With a p-value of 0.1737, which is greater than the significance level of 0.05, we do not have sufficient evidence to reject the null hypothesis. This suggests that there is no significant evidence of heteroscedasticity in the model with weighted least squares regression (mod_wls).

In addition, the Gaussian GLM (an ordinary linear model, despite being fitted with glm(); it is not a logistic regression) indicates that the ocean_proximity variable is a statistically significant predictor of house price, which makes sense given that location directly influences the price of houses. The coefficients give the estimated change in price for each category relative to the baseline category (“<1H OCEAN”), holding other factors constant. For example, being in the “ISLAND,” “NEAR BAY,” or “NEAR OCEAN” categories raises the predicted house price relative to the baseline, whereas being in the “INLAND” category lowers it.



Now lets do clustering

We grouped the data points from the California housing dataset in this clustering study based on the variables “total_rooms” and “households.” These variables are two aspects of the housing data.

The goal of clustering was to group together comparable data points based on their values in these two variables. We selected clusters or groupings of data points with comparable values in terms of the total number of rooms and households using hierarchical clustering.

The clustering algorithm examines the data points for patterns and similarities and assigns them to distinct groupings. The clusters that form indicate different segments or groupings within the dataset. Each cluster comprises data points that are more similar in terms of “total_rooms” and “households” values than data points in other clusters.

The clustering results graphic shows how the data points are distributed in the feature space defined by “total_rooms” and “households.” Each cluster is allocated a different hue, making it simpler to detect the cluster boundaries and patterns.

We may learn about the different types of housing units in the dataset by evaluating the clustering findings, which are based on the total number of rooms and households. Clustering aids in the discovery of underlying patterns and structures in data, allowing for additional analysis and decision-making.


# Set the random seed for reproducibility
set.seed(123)

# Subset the dataset to the two clustering variables
cluster_data <- housing[, c("total_rooms", "households")]

# Standardize both variables so they contribute equally to the
# Euclidean distance
scaled_data <- scale(cluster_data)

# Calculate Euclidean distances and perform hierarchical clustering
# with complete linkage
dd <- dist(scaled_data, method = "euclidean")
clusters <- hclust(dd, method = "complete")

# Determine a reasonable number of clusters with the elbow method.
# FIX: the default kmeans settings (nstart = 1, iter.max = 10) make the
# within-cluster sum of squares depend on a single random start and
# frequently fail to converge on 20k points; multiple restarts and a
# higher iteration cap give a stable elbow curve.
wss <- numeric(10)
for (i in seq_len(10)) {
  kmeans_model <- kmeans(scaled_data, centers = i, nstart = 10, iter.max = 25)
  wss[i] <- kmeans_model$tot.withinss
}

# Plot the elbow curve
plot(1:10, wss, type = "b", pch = 19, frame = FALSE, xlab = "Number of Clusters",
     ylab = "Within-Cluster Sum of Squares",
     ylim = c(min(wss, na.rm = TRUE), max(wss, na.rm = TRUE)))

# Number of clusters chosen visually from the elbow curve
k <- 3

# Cut the hierarchical tree into k clusters
cut <- cutree(clusters, k)

# Print the count of each ocean proximity category within each cluster
table(cut, housing$ocean_proximity)
##    
## cut <1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
##   1      8898   6316      5     2241       2610
##   2       227    232      0       49         45
##   3        11      3      0        0          3
# Visualize the real categories of the data points
p <- ggplot(cluster_data, aes(total_rooms, households))
p + geom_point(aes(colour = factor(housing$ocean_proximity)), size = 4) + ggtitle("Real Categories")

# Visualize the clustering results
p <- ggplot(cluster_data, aes(total_rooms, households))
p + geom_point(aes(colour = factor(cut)), size = 4) + ggtitle("Clustering Results")

# Visualize the clustering results with the legend at the bottom.
# FIX: `addlegend` is not an fviz_cluster() argument and was silently
# ignored; move the legend with a standard ggplot2 theme layer instead.
fviz_cluster(list(data = cluster_data, cluster = cut), stand = FALSE) +
  theme(legend.position = "bottom")

According to the clustering result, the data points have been divided into three groups based on the variables “total_rooms” and “households.” Each cluster reflects a unique mix of total rooms and families. Clustering aids in the identification of patterns or similarities among data points and gives insights into the various segments or groupings found in the California housing dataset.

Last but not least, supervised classification:

# Remove rows with missing values (total_bedrooms contains NAs)
housing <- housing[complete.cases(housing), ]

# Supervised Classification - Predicting Housing Category
# Label each tract "old"/"new" according to whether its median age
# exceeds the overall median, and store the label as a factor
housing$housing_median_age_category <- as.factor(ifelse(housing$housing_median_age > median(housing$housing_median_age), "old", "new"))

# Split the data into training (80%) and testing (20%) sets
set.seed(123)
train_indices <- sample(seq_len(nrow(housing)), 0.8 * nrow(housing))
train_data <- housing[train_indices, ]
test_data <- housing[-train_indices, ]

# Train a classification model (random forest)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# BUG FIX (target leakage): the label is defined by thresholding
# housing_median_age at its median, so leaving that column among the
# predictors lets the forest read the answer off a single split — which
# is exactly why the original run reported 100% accuracy. Exclude it so
# the model must learn from the remaining features.
model <- randomForest(housing_median_age_category ~ . - housing_median_age, data = train_data)

# Make predictions on the test data
predictions <- predict(model, newdata = test_data)

# Align factor levels between predicted and actual values
test_data$housing_median_age_category <- factor(test_data$housing_median_age_category,
                                                levels = levels(train_data$housing_median_age_category))

# Evaluate the model and generate the confusion matrix.
# NOTE(review): the output below was produced by the original, leaky
# model (housing_median_age included as a predictor); re-knit to obtain
# the honest performance estimate.
confusionMatrix(predictions, test_data$housing_median_age_category)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  new  old
##        new 2077    0
##        old    0 2010
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9991, 1)
##     No Information Rate : 0.5082     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.5082     
##          Detection Rate : 0.5082     
##    Detection Prevalence : 0.5082     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : new        
## 

The confusion matrix reports perfect accuracy (1.0), with sensitivity and specificity also equal to 1, meaning every “new” and “old” instance in the test set was classified correctly. However, this perfect score is an artifact of target leakage rather than genuine predictive power: the label is created by thresholding “housing_median_age” at its median, and “housing_median_age” itself is included among the predictors, so the model can recover the label exactly with a single split on that variable. The prevalence of the “new” category in the dataset is 0.5082. The dataset is randomly split into training and testing sets using a specified seed value: 80% of the data is used for training the classification model, and the remaining 20% is used for testing. The threshold for categorizing a house as “new” or “old” is determined by the median of the “housing_median_age” variable: if a tract’s median age is greater than the overall median, it is classified as “old”; otherwise it is classified as “new”. For a meaningful estimate of performance, “housing_median_age” should be excluded from the predictors before training.